bt benchmark: Normalize indentation around invalid_commits
[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py
1 #!/usr/bin/python3
2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import json
18 import os
19 import tempfile
20 from statistics import mean
21 import argparse
22 import sys
23 from operator import add
24
25 import matplotlib.pyplot as plt
26 from matplotlib.backends.backend_pdf import PdfPages
27 from matplotlib.ticker import PercentFormatter
28
29 import git
30 import numpy
31 import lava_submit
32
33 from minio import Minio
34 from minio.error import NoSuchKey
35 from minio.error import ResponseError
36
37
# Benchmark flavours. Each entry is used as a key of the per-commit result
# dictionary and as the last component of the remote result prefix.
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket name passed to every object-store call made by this script.
DEFAULT_BUCKET = "lava"

# Commits that launch_jobs() never submits; the trailing comment on each entry
# records why the commit cannot be benchmarked.
invalid_commits = {
    "ec9a9794af488a9accce7708a8b0d8188b498789",  # Does not build
    "8c99128c640cbce71fb8a6caa15e4c672252b662",  # Block on configure
    "f3847c753f1b4f12353c38d97b0577d9993d19fb",  # Does not build
    "e0111295f17ddfcc33ec771a8deac505473a06ad",  # Does not build
    "d0d4e0ed487ea23aaf0d023513c0a4d86901b79b",  # Does not build
    "c24f7ab4dd9edeb5e50b0070fd9d9e8691057dde",  # Does not build
    "ce67f5614a4db3b2de4d887eca52135b439b4937",  # Does not build
    "80aff5efc66679fd934cef433c0e698694748385",  # Does not build
    "f4f11e84942d36fcc8a597d226928bce2ccac4b3",  # Does not build
    "ae466a6e1b856d96cf5112a371b4df2b732503ec",  # Does not build
}
53
def json_type(string):
    """
    Argparse type for JSON arguments.

    Parse `string` as JSON and return the resulting dictionary.

    Raise argparse.ArgumentTypeError when `string` is not valid JSON or when
    its top-level value is not a dictionary, so that argparse reports the
    message built here.
    """
    try:
        passed_json = json.loads(string)
    except ValueError as err:
        # json.JSONDecodeError is a ValueError subclass.
        msg = "%r is not valid JSON: %s" % (string, err)
        raise argparse.ArgumentTypeError(msg) from err
    if not isinstance(passed_json, dict):
        msg = "%r is not a dict" % string
        raise argparse.ArgumentTypeError(msg)
    return passed_json
64
def graph_get_color(branch):
    """
    Get the color matching the branch.

    The three known branches keep their fixed colors. Any other branch name
    (for example one passed through --overwrite-branches-cutoff) gets one of
    the matplotlib "C0".."C9" cycle colors, chosen from the bytes of the name
    so that the same branch always maps to the same color.
    """
    color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
    if branch in color:
        return color[branch]
    # Deterministic fallback instead of a KeyError for custom branches.
    return "C{}".format(sum(branch.encode("utf-8")) % 10)
71
72
def graph_get_title(branch, benchmark_type):
    """
    Build the graph title from the branch name and the benchmark type.

    The result has the form "<branch> - <description>". A KeyError is raised
    for a benchmark type that has no description.
    """
    descriptions = {"dummy": "Dummy output", "text": "Text output"}
    description = descriptions[benchmark_type]
    title = "{} - {}".format(branch, description)
    return title
79
80
def get_client(
    endpoint="obj.internal.efficios.com", access_key="jenkins", secret_key="echo123456"
):
    """
    Return a configured minio client.

    The defaults reproduce the values this script has always used, so calling
    get_client() without arguments behaves as before. Callers may pass another
    endpoint or other credentials.

    NOTE(review): the default credentials are committed in the source file;
    consider supplying them from the job environment instead.
    """
    return Minio(endpoint, access_key=access_key, secret_key=secret_key)
88
89
def get_file(client, prefix, file_name, workdir_name):
    """
    Download "<prefix>/<file_name>" from the default bucket into workdir_name.

    Return the local path of the downloaded file, or None when the object
    does not exist on the remote (NoSuchKey).
    """
    local_path = os.path.join(workdir_name, file_name)
    try:
        client.fget_object(
            DEFAULT_BUCKET, "{}/{}".format(prefix, file_name), local_path
        )
    except NoSuchKey:
        return None
    else:
        return local_path
103
104
def delete_file(client, prefix, file_name):
    """
    Remove "<prefix>/<file_name>" from the default bucket.

    A ResponseError is printed and otherwise ignored; a missing object
    (NoSuchKey) is silently ignored.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.remove_object(DEFAULT_BUCKET, remote_name)
    except ResponseError as error:
        # Best effort: report the failure and carry on.
        print(error)
    except NoSuchKey:
        # Nothing to delete.
        return
116
117
def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (older to newer) list of commit hashes for the range
    "<cutoff>..origin/<bt_version>".

    An empty list is returned when the range contains no commit.
    WARNING: This runs a fetch on the git repo located at repo_path.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    log_output = repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    )
    # "".split("\n") yields [""], which callers would treat as a commit with
    # an empty hash; splitlines() yields [] for empty output.
    return log_output.splitlines()
128
129
def parse_result(result_path):
    """
    Load the JSON result file and return, for every run, the sum of its
    "User time (seconds)" and "System time (seconds)" entries.
    """
    with open(result_path) as result_file:
        content = json.load(result_file)
    user_times = content["User time (seconds)"]
    system_times = content["System time (seconds)"]
    return [user + system for user, system in zip(user_times, system_times)]
143
144
def get_benchmark_results(client, commit, workdir):
    """
    Fetch the benchmark results of a commit across all benchmark types.

    Return a (results, valid) tuple:
    - results maps each benchmark type to its parsed dataset, or is None as
      soon as the result file of one benchmark type is missing;
    - valid is False when at least one of the datasets parsed so far contains
      only zeros.
    """
    results = {}
    benchmark_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}".format(b_type)
        result_file = get_file(client, prefix, commit, workdir)
        if not result_file:
            # Benchmark is either corrupted or not complete.
            return None, benchmark_valid
        dataset = parse_result(result_file)
        results[b_type] = dataset
        if all(i == 0.0 for i in dataset):
            benchmark_valid = False
            # prefix already ends with the benchmark type, do not repeat it.
            print("Invalid benchmark for {}/{}".format(prefix, commit))
    # Every benchmark type has a result file for this commit.
    return results, benchmark_valid
165
166
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw value.

    x_data holds one x position per commit and y_data the matching list of
    samples. Each sample is drawn either as a regular point or, when
    sanitize_dataset() rejects it, as an outlier. The latest value of every
    other branch found in latest_values is drawn as a horizontal line.
    """
    point_x_data = []
    outlier_x_data = []
    point_y_data = []
    outlier_y_data = []
    for x, samples in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(samples)
        # Every sample of a commit shares the x position of that commit.
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    # Leave 20% of headroom above the highest sample.
    ymax = 1
    if y_data:
        ymax = 1.2 * max(item for sublist in y_data for item in sublist)
    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Keep the reference line inside the visible area.
        if l_result >= ymax:
            ymax = 1.2 * l_result
    ax = plt.gca()
    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
219
220
def plot_delta_between_point(
    branch, benchmark_type, x_data, y_data, labels, latest_values
):
    """
    Plot, for each commit, the difference between its value and the value of
    the commit that precedes it. latest_values is accepted but not used.
    """
    # The first commit has no predecessor, its delta is 0.0.
    deltas = [
        0.0 if index == 0 else value - y_data[index - 1]
        for index, value in enumerate(y_data)
    ]

    plt.plot(x_data, deltas, "o", label=branch, color=graph_get_color(branch))

    # Symmetric y axis around zero, sized from the largest absolute delta.
    half_range = 100
    if deltas:
        half_range = abs(max(deltas, key=abs)) * 1.3
    plt.ylim(ymin=-half_range, ymax=half_range)

    axes = plt.gca()
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    title = graph_get_title(branch, benchmark_type) + " Delta to previous commit"
    plt.title(title, fontweight="bold")
    plt.ylabel("Seconds")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Also label the ticks on the right side of the graph.
    axes.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
262
263
def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using a ratio using first point as reference (0%).

    y_data holds one value per commit. The latest value of every other branch
    found in latest_values is drawn as a horizontal line, expressed relative
    to the same reference. The y axis spans at least -100%..100% and is
    widened when a point falls outside of that range.
    """
    reference = 0.01
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        # Widen the axis only when the data does not fit in the default
        # range. The test must be on local_abs_max: y_abs_max is always 100
        # at this point.
        if local_abs_max > 100:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
323
324
def _fetch_branch_results(client, branches, git_path):
    """
    Return {branch: [(commit, results), ...]} holding, in git log order, every
    commit of each branch that has complete and valid benchmark results.
    """
    branch_results = dict()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results
    return branch_results


def _save_page(pdf_pages, width, plot_function, *plot_args):
    """
    Draw one graph with plot_function(*plot_args) on a new figure, append it
    to pdf_pages and release the figure.
    """
    fig = plt.figure(figsize=(width, 8.27), dpi=100)
    try:
        plot_function(*plot_args)
        pdf_pages.savefig(fig)
    finally:
        # pyplot keeps a reference to every figure until it is closed.
        plt.close(fig)


def generate_graph(branches, report_name, git_path):
    """
    Generate the PDF report named report_name.

    branches maps each branch name to its cutoff commit and git_path is the
    location of the git repository. For every benchmark type and every
    branch, three pages are produced: raw values, ratio to the first commit
    and delta to the previous commit.
    """
    # The PDF document. The context manager closes it even when fetching or
    # plotting fails.
    with PdfPages(report_name) as pdf_pages:
        client = get_client()

        # Fetch the results for each branch.
        branch_results = _fetch_branch_results(client, branches, git_path)

        for b_type in BENCHMARK_TYPES:
            latest_values = {}
            max_len = 0

            # Find the maximum size for a series inside our series dataset.
            # This is used later to compute the size of the actual plot (pdf).
            # While there gather the comparison value used to draw comparison
            # line between branches.
            for branch, results in branch_results.items():
                max_len = max(max_len, len(results))
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            # Width of every page for this benchmark type.
            if max_len and max_len > 10:
                width = 0.16 * max_len
            else:
                width = 11.69

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                y_data = [c[1][b_type] for c in results]
                labels = [c[0][:8] for c in results]

                _save_page(
                    pdf_pages,
                    width,
                    plot_raw_value,
                    branch,
                    b_type,
                    x_data,
                    y_data,
                    labels,
                    latest_values,
                )

                # Use the mean of each sanitize dataset here, we do not care
                # for variance for ratio. At least not yet.
                y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
                _save_page(
                    pdf_pages,
                    width,
                    plot_ratio,
                    branch,
                    b_type,
                    x_data,
                    y_data,
                    labels,
                    latest_values,
                )
                _save_page(
                    pdf_pages,
                    width,
                    plot_delta_between_point,
                    branch,
                    b_type,
                    x_data,
                    y_data,
                    labels,
                    latest_values,
                )
391
392
def launch_jobs(branches, git_path, wait_for_completion, debug, force):
    """
    Launch jobs for all missing results.

    For each branch, every commit of its git log that is not listed in
    invalid_commits is queued when it has no benchmark results yet, or
    unconditionally when force is set. One job is then submitted per queued
    commit.
    """
    client = get_client()
    pending = set()
    for branch, cutoff in branches.items():
        candidates = [
            sha
            for sha in get_git_log(branch, cutoff, git_path)
            if sha not in invalid_commits
        ]
        with tempfile.TemporaryDirectory() as workdir:
            for sha in candidates:
                existing, _ = get_benchmark_results(client, sha, workdir)
                if force or not existing:
                    pending.add(sha)

    total = len(pending)
    for position, sha in enumerate(pending, start=1):
        print("Job {}/{}".format(position, total))
        lava_submit.submit(sha, wait_for_completion=wait_for_completion, debug=debug)
412
413
def main():
    """
    Parse arguments and execute as needed.

    Return 1 when the repository path does not exist, 0 otherwise.
    """
    # Default branches and the cutoff commit from which each one is analysed.
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. Waiting is "
        "useful for the ci. Otherwise we could end up spamming the lava "
        "instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )
    parser.add_argument(
        "--overwrite-branches-cutoff",
        help="A dictionary of the form {"
        "'branch_name': 'commit_hash_cutoff',...}. Allow custom graphing and "
        "jobs generation.",
        required=False,
        type=json_type,
    )

    args = parser.parse_args()

    # A user supplied dictionary replaces the default branches entirely.
    if args.overwrite_branches_cutoff:
        bt_branches = args.overwrite_branches_cutoff

    if not os.path.exists(args.repo_path):
        print("Repository location does not exists.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")

        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))

        launch_jobs(
            bt_branches,
            args.repo_path,
            # The command line flag is the negation of what launch_jobs takes.
            not args.do_not_wait_on_completion,
            args.debug,
            args.force_jobs,
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0
490
491
def sanitize_dataset(dataset):
    """
    Split the dataset into regular values and outliers using the 1.5 IQR
    rule [1]. This is useful to get a representative mean without outlier in
    it.

    Return a (values, outliers) tuple of lists, each one keeping the order of
    the dataset.
    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    first_quartile, third_quartile = numpy.percentile(sorted(dataset), [25, 75])
    margin = 1.5 * (third_quartile - first_quartile)
    lowest = first_quartile - margin
    highest = third_quartile + margin

    def is_regular(value):
        return lowest <= value <= highest

    kept = [value for value in dataset if is_regular(value)]
    rejected = [value for value in dataset if not is_regular(value)]
    return kept, rejected
511
512
# Run main() only when the file is executed as a script and use its return
# value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
This page took 0.060323 seconds and 5 git commands to generate.